Normal Table HET SNP Frequency Analysis

targeted_normal_read_table <- readRDS("../Gaudin_thesis_data/normal_reads_table.rds")
SNP_targets_info <- read.table("../Gaudin_thesis_data/SNP_targets_info.csv", stringsAsFactors = F, header = FALSE, sep="\t")
SNP_targets_info <- SNP_targets_info[,c(4,5)]
targeted_normal_read_table[[1]]$chrom_and_position <- paste(targeted_normal_read_table[[1]]$Chromosome, ":",targeted_normal_read_table[[1]]$Position, "-",targeted_normal_read_table[[1]]$Position, sep="")
targeted_normal_read_table[[2]]$chrom_and_position <- paste(targeted_normal_read_table[[2]]$Chromosome, ":",targeted_normal_read_table[[2]]$Position, "-",targeted_normal_read_table[[2]]$Position, sep="")
#targeted_normal_read_table[[1]]$chrom_and_position
targeted_normal_read_table[[1]] <- merge(targeted_normal_read_table[[1]], SNP_targets_info, by.x = "chrom_and_position", by.y = "V5")

targeted_normal_read_table[[2]] <- merge(targeted_normal_read_table[[2]], SNP_targets_info, by.x = "chrom_and_position", by.y = "V5")
targeted_tumor_read_table <- readRDS("../Gaudin_thesis_data/tumor_reads_table.rds")

targeted_tumor_read_table[[1]]$chrom_and_position <- paste(targeted_tumor_read_table[[1]]$Chromosome, ":",targeted_tumor_read_table[[1]]$Position, "-",targeted_tumor_read_table[[1]]$Position, sep="")
targeted_tumor_read_table[[2]]$chrom_and_position <- paste(targeted_tumor_read_table[[2]]$Chromosome, ":",targeted_tumor_read_table[[2]]$Position, "-",targeted_tumor_read_table[[2]]$Position, sep="")
#targeted_tumor_read_table[[1]]$chrom_and_position
targeted_tumor_read_table[[1]] <- merge(targeted_tumor_read_table[[1]], SNP_targets_info, by.x = "chrom_and_position", by.y = "V5")

targeted_tumor_read_table[[2]] <- merge(targeted_tumor_read_table[[2]], SNP_targets_info, by.x = "chrom_and_position", by.y = "V5")

Mean VAF for heterozygous SNP sites in normal pool.

st_alt_table <- targeted_normal_read_table[[1]][,5:ncol(targeted_normal_read_table[[1]]) - 1] ## remove chromosome and position
st_cov_table <- targeted_normal_read_table[[2]][,5:ncol(targeted_normal_read_table[[2]]) - 1] ## remove chromosome and position
avg_alts <- c() 
avg_covs <- c()
pool_vector <- c()
sds <- c()
het_count <- c()
for(i in 1:nrow(st_alt_table)){
  indices <- which(st_alt_table[i,] > 5 & st_alt_table[i,] < 95 & st_cov_table[i,] > 250) ##only include SNP sites with VAF frequency between 25 - 75 and coverage > 100, 
  if(length(indices) > 7){ ##only average SNPs that are heterozygous at at least 7 SNP sites
    avg_alts <- c(avg_alts, mean(as.numeric(st_alt_table[i,indices])))
    avg_covs <- c(avg_covs, mean(as.numeric(st_cov_table[i,indices]))) 
    pool_vector <- c(pool_vector, targeted_normal_read_table[[1]]$V4[i])
    sds <- c(sds, sd(as.numeric(st_alt_table[i,indices])))
    het_count <- c(het_count, length(indices))
  }
}
cov_df <- data.frame(Average_vaf = avg_alts, Average_coverage = avg_covs, pool = pool_vector,Standard_devs = sds, het_count = het_count)
cov_df <- cov_df[!is.na(cov_df$Average_vaf),]
print("number of SNPs included")
## [1] "number of SNPs included"
print(nrow(cov_df))## print number of SNPs 
## [1] 487
print(ggplot(cov_df, aes(x=Average_coverage, y=Average_vaf, color = pool)) + xlab("SNP Site Average Coverage") + ylab("Mean VAF for SNP") + geom_point() + geom_vline(aes(xintercept=250), colour="#BB0000", linetype="dashed") + ylim(5,95) + xlim(0,2500))

print(ggplot(cov_df, aes(x=Average_coverage, y=Standard_devs, shape = pool, color = het_count )) + xlab("SNP Site Average Coverage") + ylab("Standard Dev for het SNP") + geom_point() + geom_vline(aes(xintercept=250), colour="#BB0000", linetype="dashed") + xlim(0,2500))

minor_alleles <- c()

for(i in 1:nrow(cov_df)){
  if(cov_df$Average_vaf[i] >= 50){
    minor_alleles <- c(minor_alleles, 100 - cov_df$Average_vaf[i])
  }else{
    minor_alleles <- c(minor_alleles, cov_df$Average_vaf[i])
  }
}

cov_df$minor_allels <- minor_alleles

print(ggplot(cov_df, aes(x=Average_coverage, y=minor_alleles, color = pool)) + xlab("SNP Site Average Coverage") + ylab("Mean VAF for minor allele SNP") + geom_point() + geom_vline(aes(xintercept=250), colour="#BB0000", linetype="dashed"))

For tumor targeted reads

st_alt_table <- targeted_tumor_read_table[[1]][,5:ncol(targeted_tumor_read_table[[1]]) - 1] ## remove chromosome and position and pool 
st_cov_table <- targeted_tumor_read_table[[2]][,5:ncol(targeted_tumor_read_table[[2]]) - 1] ## remove chromosome and position and pool 
avg_alts <- c() 
avg_covs <- c()
pool_vector <- c()
sds <- c()
het_count <- c()
for(i in 1:nrow(st_alt_table)){
  indices <- which(st_alt_table[i,] > 5 & st_alt_table[i,] < 95 & st_cov_table[i,] > 250) ##only include SNP sites with VAF frequency between 25 - 75 and coverage > 100, 
  if(length(indices) > 7){ ##only average SNPs that are heterozygous at at least 7 SNP sites
    avg_alts <- c(avg_alts, mean(as.numeric(st_alt_table[i,indices])))
    avg_covs <- c(avg_covs, mean(as.numeric(st_cov_table[i,indices]))) 
    pool_vector <- c(pool_vector, targeted_tumor_read_table[[1]]$V4[i])
    sds <- c(sds, sd(as.numeric(st_alt_table[i,indices])))
    het_count <- c(het_count, length(indices))
  }
}
cov_df <- data.frame(Average_vaf = avg_alts, Average_coverage = avg_covs, pool = pool_vector,Standard_devs = sds)
cov_df <- cov_df[!is.na(cov_df$Average_vaf),]
print("number of SNPs included")
## [1] "number of SNPs included"
print(nrow(cov_df))## print number of SNPs 
## [1] 512
print(ggplot(cov_df, aes(x=Average_coverage, y=Average_vaf, color = pool)) + xlab("SNP Site Average Coverage") + ylab("Mean VAF for SNP") + geom_point() + geom_vline(aes(xintercept=250), colour="#BB0000", linetype="dashed") + ylim(5,95) + xlim(0,3500))

print(ggplot(cov_df, aes(x=Average_coverage, y=Standard_devs, shape = pool, color = het_count )) + xlab("SNP Site Average Coverage") + ylab("Standard Dev for het SNP") + geom_point() + geom_vline(aes(xintercept=250), colour="#BB0000", linetype="dashed")+ xlim(0,3500))

gene_table <- read.table("GENES_chrom_start_stop.txt", header=TRUE)

##adjust for targeting of genes for +/- 1MB , but dont overlap MSH2 and MSH6
gene_table$Start[-c(4)] <- gene_table$Start[-c(4)] - 1000000
gene_table$Stop[-c(3)] <- gene_table$Stop[-c(3)] + 1000000
print(gene_table)
##     GENE Chromosome     Start      Stop
## 1  BRCA2         13  31889611  33973809
## 2  BRCA1         17  40196312  42277500
## 3   MSH2          2  46630108  47789450
## 4   MSH6          2  47922669  49037240
## 5   MLH1          3  36034823  38107380
## 6   PMS2          7   5012870   7048756
## 7   CDK4         12  57141510  59149796
## 8   MDM2         12  68201956  70239214
## 9  ERBB2         17  36844167  38886697
## 10  EGFR          7  54086714  56324313
## 11   MET          7 115312444 117438440
## 12   MYC          8 127747680 129753680

Histograms showing distributions of 20 targeted SNPs within EGFR (left) and BRCA1 (right) genes in normal samples (n = 47).

normal_targeted_het_counts <- gene_het_count_table(gene_table,targeted_normal_read_table,25,75,100)
for(i in 1:ncol(normal_targeted_het_counts)){
  print(hist(normal_targeted_het_counts[,i], breaks = max(normal_targeted_het_counts[,i]), main=paste("Heterozygous SNPs for Each Sample (n = 47) Targeting", colnames(normal_targeted_het_counts)[i]), xlab="Number of Heterozygous SNPs", ylab="Number of Samples"))
}

## $breaks
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14
## 
## $counts
##  [1] 10  1  2  1  4  3  2  3  2  2  8  3  4  4
## 
## $density
##  [1] 0.20408163 0.02040816 0.04081633 0.02040816 0.08163265 0.06122449
##  [7] 0.04081633 0.06122449 0.04081633 0.04081633 0.16326531 0.06122449
## [13] 0.08163265 0.08163265
## 
## $mids
##  [1]  0.5  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5 12.5 13.5
## 
## $xname
## [1] "normal_targeted_het_counts[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

## $breaks
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
## 
## $counts
##  [1] 22  4  4  0  0  0  0  0  0  0  0  1  1  1 16
## 
## $density
##  [1] 0.44897959 0.08163265 0.08163265 0.00000000 0.00000000 0.00000000
##  [7] 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.02040816
## [13] 0.02040816 0.02040816 0.32653061
## 
## $mids
##  [1]  0.5  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5 12.5 13.5
## [15] 14.5
## 
## $xname
## [1] "normal_targeted_het_counts[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

## $breaks
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
## 
## $counts
##  [1] 24  0  0  1  0  0  2  0  0  1  1  1  3  2 14
## 
## $density
##  [1] 0.48979592 0.00000000 0.00000000 0.02040816 0.00000000 0.00000000
##  [7] 0.04081633 0.00000000 0.00000000 0.02040816 0.02040816 0.02040816
## [13] 0.06122449 0.04081633 0.28571429
## 
## $mids
##  [1]  0.5  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5 12.5 13.5
## [15] 14.5
## 
## $xname
## [1] "normal_targeted_het_counts[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

## $breaks
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13
## 
## $counts
##  [1]  6  0  3 11  2  3  4  6  5  6  1  0  2
## 
## $density
##  [1] 0.12244898 0.00000000 0.06122449 0.22448980 0.04081633 0.06122449
##  [7] 0.08163265 0.12244898 0.10204082 0.12244898 0.02040816 0.00000000
## [13] 0.04081633
## 
## $mids
##  [1]  0.5  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5 12.5
## 
## $xname
## [1] "normal_targeted_het_counts[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

## $breaks
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17
## 
## $counts
##  [1] 23  4  0  0  1  0  0  0  0  1  0  1  0  1  1 10  7
## 
## $density
##  [1] 0.46938776 0.08163265 0.00000000 0.00000000 0.02040816 0.00000000
##  [7] 0.00000000 0.00000000 0.00000000 0.02040816 0.00000000 0.02040816
## [13] 0.00000000 0.02040816 0.02040816 0.20408163 0.14285714
## 
## $mids
##  [1]  0.5  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5 12.5 13.5
## [15] 14.5 15.5 16.5
## 
## $xname
## [1] "normal_targeted_het_counts[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

## $breaks
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13
## 
## $counts
##  [1] 17  0  2  0  3  6  8  7  0  1  1  1  3
## 
## $density
##  [1] 0.34693878 0.00000000 0.04081633 0.00000000 0.06122449 0.12244898
##  [7] 0.16326531 0.14285714 0.00000000 0.02040816 0.02040816 0.02040816
## [13] 0.06122449
## 
## $mids
##  [1]  0.5  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5 12.5
## 
## $xname
## [1] "normal_targeted_het_counts[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

## $breaks
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18
## 
## $counts
##  [1] 12  1  6  1  2  6  2  1  1  1  2  2  2  1  1  4  1  3
## 
## $density
##  [1] 0.24489796 0.02040816 0.12244898 0.02040816 0.04081633 0.12244898
##  [7] 0.04081633 0.02040816 0.02040816 0.02040816 0.04081633 0.04081633
## [13] 0.04081633 0.02040816 0.02040816 0.08163265 0.02040816 0.06122449
## 
## $mids
##  [1]  0.5  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5 12.5 13.5
## [15] 14.5 15.5 16.5 17.5
## 
## $xname
## [1] "normal_targeted_het_counts[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

## $breaks
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13
## 
## $counts
##  [1] 4 2 4 3 4 3 4 8 4 3 4 2 4
## 
## $density
##  [1] 0.08163265 0.04081633 0.08163265 0.06122449 0.08163265 0.06122449
##  [7] 0.08163265 0.16326531 0.08163265 0.06122449 0.08163265 0.04081633
## [13] 0.08163265
## 
## $mids
##  [1]  0.5  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5 12.5
## 
## $xname
## [1] "normal_targeted_het_counts[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

## $breaks
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19
## 
## $counts
##  [1] 20  7  1  2  0  2  0  2  1  2  0  0  0  0  3  0  5  0  4
## 
## $density
##  [1] 0.40816327 0.14285714 0.02040816 0.04081633 0.00000000 0.04081633
##  [7] 0.00000000 0.04081633 0.02040816 0.04081633 0.00000000 0.00000000
## [13] 0.00000000 0.00000000 0.06122449 0.00000000 0.10204082 0.00000000
## [19] 0.08163265
## 
## $mids
##  [1]  0.5  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5 12.5 13.5
## [15] 14.5 15.5 16.5 17.5 18.5
## 
## $xname
## [1] "normal_targeted_het_counts[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

## $breaks
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13
## 
## $counts
##  [1] 3 2 4 5 5 5 8 3 4 5 3 1 1
## 
## $density
##  [1] 0.06122449 0.04081633 0.08163265 0.10204082 0.10204082 0.10204082
##  [7] 0.16326531 0.06122449 0.08163265 0.10204082 0.06122449 0.02040816
## [13] 0.02040816
## 
## $mids
##  [1]  0.5  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5 12.5
## 
## $xname
## [1] "normal_targeted_het_counts[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

## $breaks
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11
## 
## $counts
##  [1] 16  6  0  3  2  6  1  1  1  8  5
## 
## $density
##  [1] 0.32653061 0.12244898 0.00000000 0.06122449 0.04081633 0.12244898
##  [7] 0.02040816 0.02040816 0.02040816 0.16326531 0.10204082
## 
## $mids
##  [1]  0.5  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5
## 
## $xname
## [1] "normal_targeted_het_counts[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

## $breaks
##  [1] 0 1 2 3 4 5 6 7 8 9
## 
## $counts
## [1] 10  5  3  2  2 14  8  2  3
## 
## $density
## [1] 0.20408163 0.10204082 0.06122449 0.04081633 0.04081633 0.28571429
## [7] 0.16326531 0.04081633 0.06122449
## 
## $mids
## [1] 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5
## 
## $xname
## [1] "normal_targeted_het_counts[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
#normal_targeted_het_counts

Combining individual snp-pileup dat files to form normal and tumor input files (with on and off target reads)

Necessary for large .vaf files with many samples
clin_samples <- read.table("../Gaudin_thesis_data/clin_samples_standard.txt")$V1 ##sample names for clinical files
clin_sample_paths <- paste("../Gaudin_thesis_data/standard/",clin_samples,".dat",sep="")
clin_sample_counts_merged <- merge_count_files(clin_sample_paths) ##merged SNP counts file

normal_samples <- read.table("../Gaudin_thesis_data/nova_standard_names.txt")$V1 ##sample names for normal files
normal_sample_paths <- paste("../Gaudin_thesis_data/nova_standard/",normal_samples,".dat",sep="")
normal_sample_counts_merged <- merge_count_files(normal_sample_paths) ##merged SNP counts file

saveRDS(normal_sample_counts_merged,"normal_table_merged.rds")
saveRDS(clin_sample_counts_merged,"clinical_table_merged.rds")

VAF Z-score Analysis for Tumor and Normal Samples

Load saved count files and sample names
clin_samples <- read.table("../Gaudin_thesis_data/clin_samples_standard.txt")$V1 ##sample names for clinical files
normal_samples <- read.table("../Gaudin_thesis_data/nova_standard_names.txt")$V1 ##sample names for normal files
tumor_countsmerged <- readRDS("../Gaudin_thesis_data/clinical_table_merged.rds") ##load saved counts_merged file
normal_countsmerged <- readRDS("../Gaudin_thesis_data/normal_table_merged.rds") ##load saved counts_merged file
normal_and_clinical_sample_names <- c(as.character(normal_samples),as.character(clin_samples)) ##combine into single list
Calculate Average absolute VAF Z-score for each tumor and normal sample
normal_tables <- format_het_table(normal_countsmerged,5,95) ##[[1]]: VAF frequency [[2]] coverage at SNP site
tumor_tables <- format_het_table(tumor_countsmerged,5,95) ##[[1]]: VAF frequency [[2]] coverage at SNP site
#saveRDS(tumor_tables, "../Gaudin_thesis_data/tumor_reads_table.rds")
#saveRDS(normal_tables, "../Gaudin_thesis_data/normal_reads_table.rds")
complete_vaf_df <- VAF_Z_Score_analysis(normal_tables, tumor_tables, 250, 5, 95, 5, 95, 7,normal_and_clinical_sample_names)
saveRDS(complete_vaf_df,"../Gaudin_thesis_data/cdf_250_5_95.rds")
complete_vaf_df <- readRDS("../Gaudin_thesis_data/cdf_250_5_95.rds")
#complete_vaf_df
gene_specific_z_scores <- gene_specific_tables(gene_table, tumor_tables, normal_tables, clin_samples, 5,95,5,95,7,250)
gene_z_df <- gene_specific_z_scores[[1]]
gene_counts_df <- gene_specific_z_scores[[2]]
#saveRDS(gene_z_df,"../Gaudin_thesis_data/clin_gene_z.rds")
#saveRDS(gene_counts_df,"../Gaudin_thesis_data/clin_gene_counts.rds")
sample_names <- read.table("../Gaudin_thesis_data/nova_clin_names.txt")$V1
sample_files <- read.table("../Gaudin_thesis_data/nova_clin_tlen_paths.txt")$V1

for(i in 1:length(sample_files)){
  frag_lengths <- read.table(as.character(sample_files[i]))$V1
  if(i == 1){
    nt_frags_binsize_5 <- bin_fragments(frag_lengths,5,1,600,as.character(sample_names[i]))
  }else{
    nt_frags_binsize_5 <- merge(nt_frags_binsize_5,bin_fragments(frag_lengths,5,1,600,sample_names[i]), by=c('start','end'))
  }
  print(i)
}

saveRDS(nt_frags_binsize_5,"../Gaudin_thesis_data/nt_clin_frags_binsize_5.rds")
frag_table <- readRDS("../Gaudin_thesis_data/nt_clin_frags_binsize_5.rds")
frag_table$start <- paste("s",frag_table$start,"_",frag_table$end,sep="")
frag_table_starts_ends <- frag_table$start
frag_table <- frag_table[,-c(1,2)]
frag_table <- t(frag_table)
colnames(frag_table) <- frag_table_starts_ends 
frag_table <- as.data.frame(frag_table)
frag_table <- frag_table[,-c(ncol(frag_table))]
frag_table$sample_names <- row.names(frag_table)
frag_table <- normalize_frag_table(frag_table)
fragment_bins_to_analyze <- readRDS("../Gaudin_thesis_data/fragment_bins_to_analyze.rds")
frag_table <- frag_table[,fragment_bins_to_analyze]
#frag_table <- frag_table[,c(c(26:31),c(35:40),c(51:75))] #bins between 125 - 155, 170 - 200, 250 - 375 
frag_table_normal <- frag_table[c(1:47),]
frag_table_clin <- frag_table[c(48:nrow(frag_table)),]
frag_table_normal$sample_names = rownames(frag_table_normal)
frag_table_clin$sample_names = rownames(frag_table_clin)
frag_avg_z_scores <- calculate_frag_z_scores(frag_table_clin,frag_table_normal)
saveRDS(frag_avg_z_scores,"../Gaudin_thesis_data/clin_frag_z_scores_5.rds")
Load data and merge tables
clin_mut_percent_table <- readRDS("../Gaudin_thesis_data/clin_ctDNA_est_table.rds")
clin_sample_key_table <- readRDS("../Gaudin_thesis_data/key_reference.rds") ##data containing other sample identifiers 
complete_vaf_df$sample_names <- str_replace(complete_vaf_df$sample_names, "_DONOR", "-DONOR") ##adjust to fit sample names in other files
cnv_table <- read.table("../Gaudin_thesis_data/data_CNA.txt", header = TRUE) ##table showing called cnvs
####formatting for merging######
colnames(cnv_table) <- str_replace_all(colnames(cnv_table), "[.]", "-")
row.names(cnv_table) <- cnv_table$Hugo_Symbol
cnv_table <- cnv_table[,-c(1)]
cnv_table <- t(cnv_table)
cnv_table <- data.frame(cnv_table) ##convert from atomic vector
##egfr flagged mutaions from cbio
egfr_cnv <- data.frame(Tumor_Sample_Barcode = row.names(cnv_table), egfr_cn = cnv_table$EGFR)
all_data_table <- merge(clin_mut_percent_table, egfr_cnv, all = TRUE)
all_data_table$mean_AF[is.na(all_data_table$mean_AF)] <- 0 
all_data_table  <- merge(all_data_table, clin_sample_key_table, by.x = "Tumor_Sample_Barcode", by.y = "id_1") ##id_1 = tumor sample barcode, id_2 == sample name
all_data_table <- merge(complete_vaf_df,all_data_table,by.x = "sample_names", by.y = "id_2", all.x = TRUE)
all_data_table$mean_AF[is.na(all_data_table$mean_AF)] <- 0 
frag_avg_z_scores$sample_names <- str_replace(frag_avg_z_scores$sample_names, "_DONOR","-DONOR")
all_data_table <- clin_total_table <- merge(all_data_table,frag_avg_z_scores,by = "sample_names")
all_data_table <- all_data_table[-c(which(all_data_table$pool == "tumor" & is.na(all_data_table$Tumor_Sample_Barcode))),] ##remove tumor samples (4) which don't have egfr or barcode data
##add clinical patient IDs
patient_ids <- all_data_table$Tumor_Sample_Barcode 
patient_ids <-str_replace_all(patient_ids, "-T.*", "")
all_data_table$patient_ids <- patient_ids
##add egfr gene specific Z scores
egfr_z_scores <- data.frame(sample_names = gene_z_df$sample_name, egfr_z_score = gene_z_df$EGFR, egfr_het_SNP_count = gene_counts_df$EGFR) 
all_data_table <- merge(all_data_table, egfr_z_scores, by.x = "sample_names", all.x = TRUE)
#all_data_table
##mark samples with same patient id that show copy number flagged in some samples but not others (-1 means flagged for CNV in other patient samples)
updated_egfr_cn <- c()
for(i in 1:nrow(all_data_table)){
  if(is.na(all_data_table$egfr_cn[i])){
    updated_egfr_cn <- c(updated_egfr_cn,NA)
  }
  else{
    if(all_data_table$egfr_cn[i] == 2){
      #print("hi")
      updated_egfr_cn <- c(updated_egfr_cn, 2)
    }
    else{
      if(sum(all_data_table$egfr_cn[which(all_data_table$patient_ids == all_data_table$patient_ids[i])]) > 0){
        updated_egfr_cn <- c(updated_egfr_cn, -1)
      }
      else{
        updated_egfr_cn <- c(updated_egfr_cn, 0)
      }
    } 
  }
}
all_data_table$updated_egfr_cn <- updated_egfr_cn

Boxplot distributions of mean heterozygous SNP VAF absolute Z-scores for normal pool

ggplot(all_data_table, aes(x = pool, y = VAF_Z_Score)) + geom_boxplot() + xlab("Pool") + ylab("Mean Absolute heterozygous SNP VAF Z-score")

#### Boxplot for non-absolute average VAF Z Scores

ggplot(all_data_table, aes(x = pool, y = VAF_Z_NA)) + geom_boxplot() + xlab("Pool") + ylab("Mean Absolute heterozygous SNP VAF Z-score")

Histograms showing distributions of mean heterozygous SNP VAF absolute Z-scores for normal pool
a <- ggplot()  + xlab("Average SNP VAF Z-Score") + ylab("Sample Count")
a <- a + geom_histogram(data = all_data_table[which(all_data_table$pool == "tumor"),], aes(x=VAF_Z_Score, fill=pool, color=pool, position="identity"), bins = 50)
## Warning: Ignoring unknown aesthetics: position
a <- a + geom_histogram(data = all_data_table[which(all_data_table$pool == "normal"),], aes(x=VAF_Z_Score, fill=pool, color=pool, position="identity"), bins = 50)
## Warning: Ignoring unknown aesthetics: position
b <- a + xlim(0.5,2)

print(a)

print(b)
## Warning: Removed 67 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).

## Warning: Removed 2 rows containing missing values (geom_bar).

print("normal min and max")
## [1] "normal min and max"
print(min(all_data_table[which(all_data_table$pool == "normal"),]$VAF_Z_Score))
## [1] 0.7549912
print(max(all_data_table[which(all_data_table$pool == "normal"),]$VAF_Z_Score))
## [1] 1.051833
print("tumor min and max")
## [1] "tumor min and max"
print(min(all_data_table[which(all_data_table$pool == "tumor"),]$VAF_Z_Score))
## [1] 0.6714352
print(max(all_data_table[which(all_data_table$pool == "tumor"),]$VAF_Z_Score))
## [1] 7.854321
Scatter plots showing distributions of mean heterozygous SNP VAF absolute Z-scores for normal pool
p <- ggplot() 
p <- p + geom_point(data = all_data_table[all_data_table$pool == "tumor",], aes(x=mean_AF, y=VAF_Z_Score,color=pool),alpha = 0.5)
p <- p + geom_point(data = all_data_table[all_data_table$pool == "normal",], aes(x=mean_AF, y=VAF_Z_Score,color=pool),alpha = 0.5)
p <- p + xlab("Estimated ctDNA fraction") + ylab("Mean VAF Z-score") + geom_hline(aes(yintercept = max(all_data_table[which(all_data_table$pool == "normal"),]$VAF_Z_Score)), color = "purple", linetype="dashed")
print(p)

g <- p + xlim(0,0.3) + ylim(0.7,2) + geom_vline(aes(xintercept=0.05),color = "dark green", linetype="dashed") + geom_hline(aes(yintercept = max(all_data_table[which(all_data_table$pool == "normal"),]$VAF_Z_Score)), color = "purple", linetype="dashed")
print(g)
## Warning: Removed 70 rows containing missing values (geom_point).

p <- ggplot() 
p <- p + geom_point(data = all_data_table[all_data_table$pool == "tumor",], aes(x=mean_AF, y=q75,color=pool),alpha = 0.5)
p <- p + geom_point(data = all_data_table[all_data_table$pool == "normal",], aes(x=mean_AF, y=q75,color=pool),alpha = 0.5)
p <- p + xlab("Estimated ctDNA fraction") + ylab("Mean VAF Z-score")
print(p)

g <- p + xlim(0,0.3) + ylim(0.7,2)
print(g)
## Warning: Removed 89 rows containing missing values (geom_point).

p <- ggplot() 
p <- p + geom_point(data = all_data_table[all_data_table$pool == "tumor",], aes(x=mean_AF, y=chrom_single_max,color=pool),alpha = 0.5)
p <- p + geom_point(data = all_data_table[all_data_table$pool == "normal",], aes(x=mean_AF, y=chrom_single_max,color=pool),alpha = 0.5)
p <- p + xlab("Estimated ctDNA fraction") + ylab("Mean VAF Z-score")
print(p)

g <- p + xlim(0,0.3) + ylim(0.7,2)
print(g)
## Warning: Removed 159 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).

cor.test(all_data_table$chrom_single_max, all_data_table$mean_AF)
## 
##  Pearson's product-moment correlation
## 
## data:  all_data_table$chrom_single_max and all_data_table$mean_AF
## t = 31.776, df = 517, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7819044 0.8404833
## sample estimates:
##      cor 
## 0.813244
cor.test(all_data_table$chrom_double_max, all_data_table$mean_AF)
## 
##  Pearson's product-moment correlation
## 
## data:  all_data_table$chrom_double_max and all_data_table$mean_AF
## t = 32.516, df = 517, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7891013 0.8459143
## sample estimates:
##       cor 
## 0.8195115
cor.test(all_data_table$chrom_triple_max, all_data_table$mean_AF)
## 
##  Pearson's product-moment correlation
## 
## data:  all_data_table$chrom_triple_max and all_data_table$mean_AF
## t = 32.183, df = 517, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7859007 0.8435005
## sample estimates:
##       cor 
## 0.8167251
cor.test(all_data_table$VAF_Z_Score, all_data_table$mean_AF)
## 
##  Pearson's product-moment correlation
## 
## data:  all_data_table$VAF_Z_Score and all_data_table$mean_AF
## t = 28.277, df = 517, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7430741 0.8109762
## sample estimates:
##       cor 
## 0.7793024
p <- ggplot() 
p <- p + geom_point(data = all_data_table[all_data_table$pool == "tumor",], aes(x=mean_AF, y=chrom_double_max,color=pool),alpha = 0.5)
p <- p + geom_point(data = all_data_table[all_data_table$pool == "normal",], aes(x=mean_AF, y=chrom_double_max,color=pool),alpha = 0.5)
p <- p + xlab("Estimated ctDNA fraction") + ylab("Mean VAF Z-score")
print(p)

g <- p + xlim(0,0.3) + ylim(0.7,2)
print(g)
## Warning: Removed 138 rows containing missing values (geom_point).

p <- ggplot() 
p <- p + geom_point(data = all_data_table[all_data_table$pool == "tumor",], aes(x=mean_AF, y=chrom_triple_max,color=pool),alpha = 0.5)
p <- p + geom_point(data = all_data_table[all_data_table$pool == "normal",], aes(x=mean_AF, y=chrom_triple_max,color=pool),alpha = 0.5)
p <- p + xlab("Estimated ctDNA fraction") + ylab("Mean VAF Z-score") + geom_hline(aes(yintercept = max(all_data_table[which(all_data_table$pool == "normal"),]$chrom_triple_max)), color = "purple", linetype="dashed")
print(p)

g <- p + xlim(0,0.3) + ylim(0.8,2) + geom_vline(aes(xintercept=0.05),color = "dark green", linetype="dashed") + geom_hline(aes(yintercept = max(all_data_table[which(all_data_table$pool == "normal"),]$chrom_triple_max)), color = "purple", linetype="dashed")
print(g)
## Warning: Removed 126 rows containing missing values (geom_point).

print("normal min and max")
## [1] "normal min and max"
print(min(all_data_table[which(all_data_table$pool == "normal"),]$chrom_triple_max))
## [1] 0.9901933
print(max(all_data_table[which(all_data_table$pool == "normal"),]$chrom_triple_max))
## [1] 1.605519
print("tumor min and max")
## [1] "tumor min and max"
print(min(all_data_table[which(all_data_table$pool == "tumor"),]$chrom_triple_max))
## [1] 0.9665942
print(max(all_data_table[which(all_data_table$pool == "tumor"),]$chrom_triple_max))
## [1] 19.39982

Density plot for average genome-wide VAF Z-scores

density_normal <- data.frame(samples = rep("normal", nrow(all_data_table[which(all_data_table$pool == "normal"),])),VAF_Z_Score = all_data_table[which(all_data_table$pool == "normal"),]$VAF_Z_Score)
density_0 <- data.frame(samples = rep("ctDNA = 0", nrow(all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF == 0),])),VAF_Z_Score = all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF == 0),]$VAF_Z_Score)
density_all <- data.frame(samples = rep("All tumor", nrow(all_data_table[which(all_data_table$pool == "tumor"),])),VAF_Z_Score = all_data_table[which(all_data_table$pool == "tumor"),]$VAF_Z_Score)
density_above_0 <- data.frame(samples = rep("ctDNA > 0", nrow(all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0),])),VAF_Z_Score = all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0),]$VAF_Z_Score)
density_above_0.05 <- data.frame(samples = rep("ctDNA > 0.05", nrow(all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0.05),])),VAF_Z_Score = all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0.05),]$VAF_Z_Score)
density_df <- rbind(density_normal, density_all, density_0, density_above_0, density_above_0.05)
#density_df
p <- ggplot(density_df, aes(x=VAF_Z_Score, color = samples)) + geom_density() + xlim(0.5,2)  + scale_color_manual(values=c("tan4", "turquoise3", "mediumorchid2", "lightcoral","yellowgreen"))
p
## Warning: Removed 189 rows containing non-finite values (stat_density).

Receiver operator characteristic curves for predicting samples that came from clinical cancer patient samples or normal samples with VAF Z-score

print("roc_df_0")
## [1] "roc_df_0"
roc_df_0 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF == 0),], "VAF")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 130
## [1] "Number of tumor samples above max normal vaf z"
## [1] 15
## [1] "Sensitivity"
## [1] 0.1153846
## [1] "AUC"
## [1] 0.4495908
print("roc_df_full")
## [1] "roc_df_full"
roc_df_full <- ROC_curve_function(all_data_table, "VAF")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 472
## [1] "Number of tumor samples above max normal vaf z"
## [1] 150
## [1] "Sensitivity"
## [1] 0.3177966
## [1] "AUC"
## [1] 0.6208979
print("roc_df_above_0")
## [1] "roc_df_above_0"
roc_df_above_0 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF > 0 | all_data_table$pool == "normal"),], "VAF")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 342
## [1] "Number of tumor samples above max normal vaf z"
## [1] 135
## [1] "Sensitivity"
## [1] 0.3947368
## [1] "AUC"
## [1] 0.6860147
print("roc_df_above_05")
## [1] "roc_df_above_05"
roc_df_above_05 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF > 0.05 | all_data_table$pool == "normal"),], "VAF")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 107
## [1] "Number of tumor samples above max normal vaf z"
## [1] 102
## [1] "Sensitivity"
## [1] 0.953271
## [1] "AUC"
## [1] 0.9990058
roc_df_0$samples <- rep("ctDNA = 0", nrow(roc_df_0))
roc_df_full$samples <- rep("All",nrow(roc_df_full))
roc_df_above_0$samples <- rep("Above 0.0 ctDNA",nrow(roc_df_above_0))
roc_df_above_05$samples <- rep("Above 0.05 ctDNA",nrow(roc_df_above_05))
roc_df <- rbind(roc_df_0,roc_df_full,roc_df_above_0,roc_df_above_05)
ROC_curve<-ggplot(roc_df, aes(x=False_positive_rate, y=True_positive_rate, color = samples)) + geom_path() + ylim(0,1) + geom_abline(intercept = 0, slope = 1) + xlab("1 - Specificity") + ylab("Sensitivity") + labs(color = "Samples Included")
print(ROC_curve)

density_normal <- data.frame(samples = rep("normal", nrow(all_data_table[which(all_data_table$pool == "normal"),])),chrom_triple_max = all_data_table[which(all_data_table$pool == "normal"),]$chrom_triple_max)
density_0 <- data.frame(samples = rep("ctDNA = 0", nrow(all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF == 0),])),chrom_triple_max = all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF == 0),]$chrom_triple_max)
density_all <- data.frame(samples = rep("All tumor", nrow(all_data_table[which(all_data_table$pool == "tumor"),])),chrom_triple_max = all_data_table[which(all_data_table$pool == "tumor"),]$chrom_triple_max)
density_above_0 <- data.frame(samples = rep("ctDNA > 0", nrow(all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0),])),chrom_triple_max = all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0),]$chrom_triple_max)
density_above_0.05 <- data.frame(samples = rep("ctDNA > 0.05", nrow(all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0.05),])),chrom_triple_max = all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0.05),]$chrom_triple_max)
density_df <- rbind(density_normal, density_all, density_0, density_above_0, density_above_0.05)
#density_df
p <- ggplot(density_df, aes(x=chrom_triple_max, color = samples)) + geom_density() + xlim(0.7,2) + xlab("Mean VAF Z-score for 3 Chromosomes with highest Z-scores ") + scale_color_manual(values=c("tan4", "turquoise3", "mediumorchid2", "lightcoral","yellowgreen"))
p
## Warning: Removed 347 rows containing non-finite values (stat_density).

print("roc_df_0")
## [1] "roc_df_0"
roc_df_0 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF == 0),], "top_3")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 130
## [1] "Number of tumor samples above max normal vaf z"
## [1] 13
## [1] "Sensitivity"
## [1] 0.1
## [1] "AUC"
## [1] 0.5646481
print("roc_df_full")
## [1] "roc_df_full"
roc_df_full <- ROC_curve_function(all_data_table, "top_3")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 472
## [1] "Number of tumor samples above max normal vaf z"
## [1] 159
## [1] "Sensitivity"
## [1] 0.3368644
## [1] "AUC"
## [1] 0.7041111
print("roc_df_above_0")
## [1] "roc_df_above_0"
roc_df_above_0 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF > 0 | all_data_table$pool == "normal"),], "top_3")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 342
## [1] "Number of tumor samples above max normal vaf z"
## [1] 146
## [1] "Sensitivity"
## [1] 0.4269006
## [1] "AUC"
## [1] 0.7571233
print("roc_df_above_05")
## [1] "roc_df_above_05"
roc_df_above_05 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF > 0.05 | all_data_table$pool == "normal"),], "top_3")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 107
## [1] "Number of tumor samples above max normal vaf z"
## [1] 107
## [1] "Sensitivity"
## [1] 1
## [1] "AUC"
## [1] 1
roc_df_0$samples <- rep("ctDNA = 0", nrow(roc_df_0))
roc_df_full$samples <- rep("All",nrow(roc_df_full))
roc_df_above_0$samples <- rep("Above 0.0 ctDNA",nrow(roc_df_above_0))
roc_df_above_05$samples <- rep("Above 0.05 ctDNA",nrow(roc_df_above_05))
roc_df <- rbind(roc_df_0,roc_df_full,roc_df_above_0,roc_df_above_05)
ROC_curve<-ggplot(roc_df, aes(x=False_positive_rate, y=True_positive_rate, color = samples)) + geom_path() + ylim(0,1) + geom_abline(intercept = 0, slope = 1) + xlab("1 - Specificity") + ylab("Sensitivity") + labs(color = "Samples Included")
print(ROC_curve)

Scatter plot distribution for tumor samples (n = 383) with 7 or more heterozygous SNPs within targeted EGFR gene

ggplot(all_data_table[which(all_data_table$pool == "tumor" & all_data_table$egfr_het_SNP_count > 7),], aes(x=mean_AF, y= egfr_z_score, color = as.character(updated_egfr_cn))) + geom_point() + ylab("Mean Z-Score for EGFR Heterozygous SNPs") + xlab("Estimated ctDNA Fraction") + ggtitle("EGFR Mean SNP VAF Z-Score vs Predicted ctDNA Fraction and CNV") + scale_color_discrete(name = "Predicted CNV Amplification", labels = c("Amplified in other samples from same patient", "No amplification", "Amplification"), l = 70, c = 150) 

print("number of SNPs")
## [1] "number of SNPs"
print(length(which(all_data_table$egfr_het_SNP_count > 7)))
## [1] 401
print("number of SNPs from patients with no flagged EGFR cnv")
## [1] "number of SNPs from patients with no flagged EGFR cnv"
print(length(which(all_data_table$egfr_het_SNP_count > 7 & all_data_table$updated_egfr_cn == 0)))
## [1] 390
print("number of SNPs from samples with flagged EGFR cnv")
## [1] "number of SNPs from samples with flagged EGFR cnv"
print(length(which(all_data_table$egfr_het_SNP_count > 7 & all_data_table$updated_egfr_cn == 2)))
## [1] 9
print("number of SNPs from samples with no flagged EGFR cnv but patients with flagged egfr CN in other samples")
## [1] "number of SNPs from samples with no flagged EGFR cnv but patients with flagged egfr CN in other samples"
print(length(which(all_data_table$egfr_het_SNP_count > 7 & all_data_table$updated_egfr_cn == -1)))
## [1] 2

Histograms showing distributions of mean fragment length absolute Z-scores for normal pool

a <- ggplot()  + xlab("Average SNP VAF Z-Score") + ylab("Sample Count")
a <- a + geom_histogram(data = all_data_table[which(all_data_table$pool == "tumor"),], aes(x=abs_frag_z_score, fill=pool, color=pool, position="identity"))
## Warning: Ignoring unknown aesthetics: position
a <- a + geom_histogram(data = all_data_table[which(all_data_table$pool == "normal"),], aes(x=abs_frag_z_score, fill=pool, color=pool, position="identity"),alpha=0.8)  + xlab("Mean Fragment Length Z-Score")
## Warning: Ignoring unknown aesthetics: position
ggplot(all_data_table, aes(x=abs_frag_z_score, fill=pool, color=pool), bins = 90) + geom_histogram(position="identity", alpha=0.5) + theme(legend.position="top") + xlab("Mean Fragment Length Z-Score") + ylab("Sample Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(all_data_table, aes(x=abs_frag_z_score, fill=pool, color=pool), bins = 90) + geom_histogram(position="identity", alpha=0.5) +
  theme(legend.position="top") + xlim(0.1,3) + xlab("Mean Fragment Length Z-Score") + ylab("Sample Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 61 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_bar).

print(a)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

print(b)
## Warning: Removed 67 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).

## Warning: Removed 2 rows containing missing values (geom_bar).

print("normal min and max")
## [1] "normal min and max"
print(min(all_data_table[which(all_data_table$pool == "normal"),]$abs_frag_z_score))
## [1] 0.191482
print(max(all_data_table[which(all_data_table$pool == "normal"),]$abs_frag_z_score))
## [1] 2.478853
print("tumor min and max")
## [1] "tumor min and max"
print(min(all_data_table[which(all_data_table$pool == "tumor"),]$abs_frag_z_score))
## [1] 0.2660128
print(max(all_data_table[which(all_data_table$pool == "tumor"),]$abs_frag_z_score))
## [1] 7.053137

Scatter plots showing distributions of mean absolute fragment length Z-scores for normal pool

p <- ggplot() 
p <- p + geom_point(data = clin_total_table[clin_total_table$pool == "tumor",], aes(x=mean_AF, y=abs_frag_z_score,color=pool),alpha = 0.5)
p <- p + geom_point(data = clin_total_table[clin_total_table$pool == "normal",], aes(x=mean_AF, y=abs_frag_z_score,color=pool),alpha = 0.5)
p <- p + xlab("Estimated ctDNA fraction") + ylab("Mean Fragment Length Z-score") + geom_hline(aes(yintercept = max(all_data_table[which(all_data_table$pool == "normal"),]$abs_frag_z_score)), color = "purple", linetype="dashed")
print(p)

g <- p + xlim(0,0.2) + ylim(0.1,3)  + geom_vline(aes(xintercept=0.05),color = "dark green", linetype="dashed") + geom_hline(aes(yintercept = max(all_data_table[which(all_data_table$pool == "normal"),]$abs_frag_z_score)), color = "purple", linetype="dashed")
print(g)
## Warning: Removed 71 rows containing missing values (geom_point).

Density plot for Frag Length Z-scores

density_normal <- data.frame(samples = rep("normal", nrow(all_data_table[which(all_data_table$pool == "normal"),])),abs_frag_z_score = all_data_table[which(all_data_table$pool == "normal"),]$abs_frag_z_score)
density_0 <- data.frame(samples = rep("ctDNA = 0", nrow(all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF == 0),])),abs_frag_z_score = all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF == 0),]$abs_frag_z_score)
density_all <- data.frame(samples = rep("All tumor", nrow(all_data_table[which(all_data_table$pool == "tumor"),])),abs_frag_z_score = all_data_table[which(all_data_table$pool == "tumor"),]$abs_frag_z_score)
density_above_0 <- data.frame(samples = rep("ctDNA > 0", nrow(all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0),])),abs_frag_z_score = all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0),]$abs_frag_z_score)
density_above_0.05 <- data.frame(samples = rep("ctDNA > 0.05", nrow(all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0.05),])),abs_frag_z_score = all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0.05),]$abs_frag_z_score)
density_df <- rbind(density_normal, density_all, density_0, density_above_0, density_above_0.05)
#density_df
p <- ggplot(density_df, aes(x=abs_frag_z_score, color = samples)) + geom_density() + xlim(0,4) + scale_color_manual(values=c("tan4", "turquoise3", "mediumorchid2", "lightcoral","yellowgreen")) + xlab("Fragment Length Z-score")
p
## Warning: Removed 64 rows containing non-finite values (stat_density).

#### ROC curves for classifying samples as clinical cancer patient samples or normal samples

print("roc_df_0")
## [1] "roc_df_0"
roc_df_0 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF == 0),], "fragment_length")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 130
## [1] "Number of tumor samples above max frag z"
## [1] 12
## [1] "Sensitivity"
## [1] 0.09230769
## [1] "AUC"
## [1] 0.7607201
print("roc_df_full")
## [1] "roc_df_full"
roc_df_full <- ROC_curve_function(all_data_table, "fragment_length")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 472
## [1] "Number of tumor samples above max frag z"
## [1] 91
## [1] "Sensitivity"
## [1] 0.1927966
## [1] "AUC"
## [1] 0.8262261
print("roc_df_above_0")
## [1] "roc_df_above_0"
roc_df_above_0 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF > 0 | all_data_table$pool == "normal"),], "fragment_length")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 342
## [1] "Number of tumor samples above max frag z"
## [1] 79
## [1] "Sensitivity"
## [1] 0.2309942
## [1] "AUC"
## [1] 0.851126
print("roc_df_above_05")
## [1] "roc_df_above_05"
roc_df_above_05 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF > 0.05 | all_data_table$pool == "normal"),], "fragment_length")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 107
## [1] "Number of tumor samples above max frag z"
## [1] 47
## [1] "Sensitivity"
## [1] 0.4392523
## [1] "AUC"
## [1] 0.9210579
roc_df_0$samples <- rep("ctDNA = 0", nrow(roc_df_0))
roc_df_full$samples <- rep("All",nrow(roc_df_full))
roc_df_above_0$samples <- rep("Above 0.0 ctDNA",nrow(roc_df_above_0))
roc_df_above_05$samples <- rep("Above 0.05 ctDNA",nrow(roc_df_above_05))
roc_df <- rbind(roc_df_0,roc_df_full,roc_df_above_0,roc_df_above_05)
ROC_curve<-ggplot(roc_df, aes(x=False_positive_rate, y=True_positive_rate, color = samples)) + geom_path() + ylim(0,1) + geom_abline(intercept = 0, slope = 1) + xlab("1 - Specificity") + ylab("Sensitivity") + labs(color = "Samples Included")
print(ROC_curve)

Scatter plot demonstrating the correlation between mean SNP VAF Z-scores, mean fragment length Z-scores, and estimated ctDNA fraction

print(ggplot(all_data_table[which(all_data_table$pool == "tumor"),], aes(x=abs_frag_z_score, y=VAF_Z_Score, color = mean_AF)) + geom_point() + ylab("Mean SNP VAF Z-score") + xlab("Mean Fragment Length Z-Score") + labs(color = "Estimated ctDNA Fraction"))

print("Correlation for VAF Z-score and Frag Z Score in all samples")
## [1] "Correlation for VAF Z-score and Frag Z Score in all samples"
print(cor(all_data_table$VAF_Z_Score, all_data_table$abs_frag_z_score))
## [1] 0.5107707
print("Correlation for VAF Z-score and Frag Z Score in tumor samples")
## [1] "Correlation for VAF Z-score and Frag Z Score in tumor samples"
print(cor(all_data_table[which(all_data_table$pool == "tumor"),]$VAF_Z_Score, all_data_table[which(all_data_table$pool == "tumor"),]$abs_frag_z_score))
## [1] 0.5041213
print("Correlation between VAF Z-score and estimated ctDNA fraction")
## [1] "Correlation between VAF Z-score and estimated ctDNA fraction"
print(cor(all_data_table[which(all_data_table$pool == "tumor"),]$VAF_Z_Score, all_data_table[which(all_data_table$pool == "tumor"),]$mean_AF))
## [1] 0.775439
print("Correlation between fragment length Z-score and estimated ctDNA fraction")
## [1] "Correlation between fragment length Z-score and estimated ctDNA fraction"
print(cor(all_data_table[which(all_data_table$pool == "tumor"),]$abs_frag_z_score, all_data_table[which(all_data_table$pool == "tumor"),]$mean_AF))
## [1] 0.5082742
print("Correlation for top 3 chrom VAF and meanAF in all samples")
## [1] "Correlation for top 3 chrom VAF and meanAF in all samples"
print(cor(all_data_table$chrom_triple_max, all_data_table$mean_AF))
## [1] 0.8167251
print("Correlation between VAF Z-score and estimated ctDNA fraction")
## [1] "Correlation between VAF Z-score and estimated ctDNA fraction"
print(cor(all_data_table$VAF_Z_Score, all_data_table$mean_AF))
## [1] 0.7793024
print("Correlation between fragment length Z-score and estimated ctDNA fraction")
## [1] "Correlation between fragment length Z-score and estimated ctDNA fraction"
print(cor(all_data_table$abs_frag_z_score, all_data_table$mean_AF))
## [1] 0.5171402

Scatter plot demonstrating the correlation between mean SNP VAF Z-scores, and mean fragment length Z-scores, and for both normal

f <- ggplot() + geom_point(data = all_data_table[which(all_data_table$pool == "tumor"),], aes(x=abs_frag_z_score, y=VAF_Z_Score, color = pool)) + ylab("Mean SNP VAF Z-score") + xlab("Mean Fragment Length Z-Score") + labs(color = "Pool")
f <- f + geom_point(data = all_data_table[which(all_data_table$pool == "normal"),], aes(x=abs_frag_z_score, y=VAF_Z_Score, color = pool))
print(f)

ROC curves for classifying samples as clinical cancer patient samples or normal samples (n = 47) using VAF Z-score, top 3 VAF, and fragment length Z-score

print("roc_df_0")
## [1] "roc_df_0"
roc_df_0 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF == 0),], "frag_with_VAF_cutoff")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 130
## [1] "Number of tumor samples above max frag z or max VAF frag z"
## [1] 24
## [1] "Sensitivity"
## [1] 0.1846154
## [1] "AUC"
## [1] 0.7818331
print("roc_df_full")
## [1] "roc_df_full"
roc_df_full <- ROC_curve_function(all_data_table, "frag_with_VAF_cutoff")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 472
## [1] "Number of tumor samples above max frag z or max VAF frag z"
## [1] 199
## [1] "Sensitivity"
## [1] 0.4216102
## [1] "AUC"
## [1] 0.8588172
print("roc_df_above_0")
## [1] "roc_df_above_0"
roc_df_above_0 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF > 0 | all_data_table$pool == "normal"),], "frag_with_VAF_cutoff")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 342
## [1] "Number of tumor samples above max frag z or max VAF frag z"
## [1] 175
## [1] "Sensitivity"
## [1] 0.5116959
## [1] "AUC"
## [1] 0.8880801
print("roc_df_above_05")
## [1] "roc_df_above_05"
roc_df_above_05 <- ROC_curve_function(all_data_table[which(all_data_table$mean_AF > 0.05 | all_data_table$pool == "normal"),], "frag_with_VAF_cutoff")
## [1] "ctDNA flagged positive at 100% specificity"
## [1] "Number of tumor samples"
## [1] 107
## [1] "Number of tumor samples above max frag z or max VAF frag z"
## [1] 107
## [1] "Sensitivity"
## [1] 1
## [1] "AUC"
## [1] 1
roc_df_0$samples <- rep("ctDNA = 0", nrow(roc_df_0))
roc_df_full$samples <- rep("All",nrow(roc_df_full))
roc_df_above_0$samples <- rep("Above 0.0 ctDNA",nrow(roc_df_above_0))
roc_df_above_05$samples <- rep("Above 0.05 ctDNA",nrow(roc_df_above_05))
roc_df <- rbind(roc_df_0,roc_df_full,roc_df_above_0,roc_df_above_05)
ROC_curve<-ggplot(roc_df, aes(x=False_positive_rate, y=True_positive_rate, color = samples)) + geom_path() + ylim(0,1) + geom_abline(intercept = 0, slope = 1) + xlab("1 - Specificity") + ylab("Sensitivity") + labs(color = "Samples Included")
print(ROC_curve)

Average fraction of total cfDNA for fragment lengths of normal pool samples (n = 47) and a). clinical, tumor pool samples (n = 472) b). clinical, tumor pool samples with estimated ctDNA fraction above 0 (n = 342) c). clinical, tumor pool samples with estimated ctDNA fraction above 0.05 (n = 107)

frag_table <- readRDS("../Gaudin_thesis_data/nt_clin_frags_binsize_5.rds")
frag_table$start <- paste("s",frag_table$start,"_",frag_table$end,sep="")
frag_table_starts_ends <- frag_table$start
frag_table <- frag_table[,-c(1,2)]
frag_table <- t(frag_table)
colnames(frag_table) <- frag_table_starts_ends 
frag_table <- as.data.frame(frag_table)
frag_table <- frag_table[,-c(ncol(frag_table))]
frag_table$sample_names <- row.names(frag_table)
frag_table <- normalize_frag_table(frag_table)
frag_table$sample_names <- str_replace(frag_table$sample_names, "_DONOR", "-DONOR")
normal_indices <- which(frag_table$sample_names %in% all_data_table[which(all_data_table$pool == "normal"),]$sample_names)
tumor_all_indices <- which(frag_table$sample_names %in% all_data_table[which(all_data_table$pool == "tumor"),]$sample_names)
tumor_above_0_indices <- which(frag_table$sample_names %in% all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0),]$sample_names)
tumor_above_05_indices <- which(frag_table$sample_names %in% all_data_table[which(all_data_table$pool == "tumor" & all_data_table$mean_AF > 0.05),]$sample_names)

normal_frag_table <- frag_table[normal_indices,-c(ncol(frag_table))]
clin_table_all <- frag_table[tumor_all_indices,-c(ncol(frag_table))]
normal_avgs <- colSums(normal_frag_table)/nrow(normal_frag_table)
clin_avgs_all <- colSums(clin_table_all)/nrow(clin_table_all)
all_avgs <- c(normal_avgs,clin_avgs_all)

clin_table_above_0 <- frag_table[tumor_above_0_indices,-c(ncol(frag_table))]
clin_avgs_above_0 <- colSums(clin_table_above_0)/nrow(clin_table_above_0)
clin_table_above_05 <- frag_table[tumor_above_05_indices,-c(ncol(frag_table))]
clin_avgs_above_05 <- colSums(clin_table_above_05)/nrow(clin_table_above_05)

all_avgs_above_0 <-  c(normal_avgs,clin_avgs_above_0)
all_avgs_above_05 <-  c(normal_avgs,clin_avgs_above_05)

repeated <- length(all_avgs)/2
normal_names <- rep("normal",repeated)
tumor_names <- rep("tumor",repeated)
all_names <- c(normal_names,tumor_names)
intervals <- c(seq(1,595,5),seq(1,595,5))
fragments_df_all <- data.frame(avg_coverage = all_avgs, pool = all_names, bin = intervals)
fragments_df_above_0 <- data.frame(avg_coverage = all_avgs_above_0, pool = all_names, bin = intervals)
fragments_df_above_05 <- data.frame(avg_coverage = all_avgs_above_05, pool = all_names, bin = intervals)

print(ggplot(fragments_df_all, aes(bin, avg_coverage, colour = pool)) + geom_path() + ylab("Fraction cfDNA") + xlab("Fragment Length") + ggtitle("Average Fragment Length Distributions for HiSeq Normals and Ret Tumor Samples") + xlim(0,600))

print(ggplot(fragments_df_above_0, aes(bin, avg_coverage, colour = pool)) + geom_path() + ylab("Fraction cfDNA") + xlab("Fragment Length") + ggtitle("Average Fragment Length Distributions for HiSeq Normals and Ret Tumor Samples") + xlim(0,600))

print(ggplot(fragments_df_above_05, aes(bin, avg_coverage, colour = pool)) + geom_path() + ylab("Fraction cfDNA") + xlab("Fragment Length") + ggtitle("Average Fragment Length Distributions for HiSeq Normals and Ret Tumor Samples") + xlim(0,600))

print(ks.test(normal_avgs, clin_avgs_all))
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  normal_avgs and clin_avgs_all
## D = 0.16807, p-value = 0.06937
## alternative hypothesis: two-sided
print(ks.test(normal_avgs, clin_avgs_above_0))
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  normal_avgs and clin_avgs_above_0
## D = 0.16807, p-value = 0.06937
## alternative hypothesis: two-sided
print(ks.test(normal_avgs, clin_avgs_above_05))
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  normal_avgs and clin_avgs_above_05
## D = 0.14286, p-value = 0.1762
## alternative hypothesis: two-sided
#colnames(normal_frag_table) == colnames(clin_table_all)
p_values <- c()
for(i in 1:ncol(normal_frag_table)){
  x <- normal_frag_table[,i]
  y <- clin_table_above_05[,i]
  v <- ks.test(x, y)
  p_values <- c(p_values, v$p.value)
}
#p_values

intervals_p <- seq(1,595,5)

p_cutoff <- 0.05 / length(intervals_p) ##hypothesis correction

abline <- -log( p_cutoff, 10)
fragments_p_values <- data.frame(p_value = p_values, bin = intervals_p)
print(ggplot(fragments_p_values, aes(bin,-log( p_values, 10))) + geom_path() + ylab("-LOG10(p-value)") + xlab("Fragment Length") + geom_hline(yintercept = abline))

#fragments_p_values

#fragments_p_values[which(fragments_p_values$p_value < p_cutoff & normal_avgs > 0.001),] ##abaove .1% for bin covg

fragment_bins_to_analyze <- which(fragments_p_values$p_value < p_cutoff & normal_avgs > 0.001)

fragment_above_001 <- which(normal_avgs > 0.001)

##change p 
saveRDS(fragment_bins_to_analyze, "../Gaudin_thesis_data/fragment_bins_to_analyze.rds")

TSNE and PCA plots showing separation between normal (red) and clinical samples (blue).

TSNE
tsne_table_sd_scaled <- frag_table[c(normal_indices,tumor_all_indices),fragment_above_001]
#tsne_table_sd_scaled <- frag_table[,fragment_bins_to_analyze]
for(i in 1:ncol(tsne_table_sd_scaled)){
min <- min(tsne_table_sd_scaled[,i])
max <- max(tsne_table_sd_scaled[,i])
max_m_min <- max - min
tsne_table_sd_scaled[,i] <- (tsne_table_sd_scaled[,i] - min)/max_m_min 
}
value_vector <- rep("normal",nrow(frag_table))
value_vector[tumor_all_indices] <- "ctDNA_est_0"
value_vector[tumor_above_0_indices] <- "ctDNA_above_0"
value_vector[tumor_above_05_indices] <- "ctDNA_above_05"
value_vector <- value_vector[c(normal_indices,tumor_all_indices)]
#make colors and labels vectors
color_vector <- value_vector
color_vector[which(color_vector == "normal")] <- "2" #red
color_vector[which(color_vector == "ctDNA_est_0")] <- "3" #green
color_vector[which(color_vector == "ctDNA_above_0")] <- "4" #blue
color_vector[which(color_vector == "ctDNA_above_05")] <-  "1" #black
color_vector <- as.numeric(color_vector)
label_vector <- color_vector
label_vector[] <- "*"
#make TSNE
train <- as.matrix(tsne_table_sd_scaled)
tsne <- Rtsne(train, dims = 2, perplexity=30, verbose=FALSE, max_iter = 3000, check_duplicates=FALSE)
plot(tsne$Y, t='n', main="tsne")
text(tsne$Y, labels = label_vector, col = color_vector)
##may need to adjust legend position (x, y are first two parameters)
legend(-20, -5, legend=c("normal", "ctDNA_est_0","ctDNA_above_0", "ctDNA_above_05" ),col=c("red", "green", "blue", "black"), lwd = 1, box.lty=0, box.lwd=0)

###### PCA

pca <- prcomp(t(tsne_table_sd_scaled), scale = FALSE)
pca <- as.data.frame(pca[2]$rotation)
pca$status <- value_vector
print(ggplot(pca) + geom_point(aes(PC1,PC2, fill = status, color = status)))

Logistic Regression

All samples
log_vf_frag_df <- data.frame(pool = all_data_table[which(all_data_table$mean_AF == 0),]$pool, VAF_Z_Score = all_data_table[which(all_data_table$mean_AF == 0),]$VAF_Z_Score, abs_frag_z_score = all_data_table[which(all_data_table$mean_AF == 0),]$abs_frag_z_score, chrom_triple_max = all_data_table[which(all_data_table$mean_AF == 0),]$chrom_triple_max)
glm.fit <- glm(pool~ VAF_Z_Score + abs_frag_z_score + chrom_triple_max, data = log_vf_frag_df, family = binomial)
glm.probs <- predict(glm.fit,type = "response")
d <- roc(pool ~ glm.probs, data = log_vf_frag_df, print.auc=TRUE)
## Setting levels: control = normal, case = tumor
## Setting direction: controls < cases
auc(pool ~ glm.probs, data = log_vf_frag_df, print.auc=TRUE)
## Setting levels: control = normal, case = tumor
## Setting direction: controls < cases
## Area under the curve: 0.7915
ci.se(d,specificities = c(1))
## 95% CI (2000 stratified bootstrap replicates):
##  sp se.low se.median se.high
##   1 0.1077    0.1769     0.4
#plot(d, asp = NA, xlim=c(1, 0), ylim = c(0,1), col="purple")
log_vf_frag_df <- data.frame(pool = all_data_table$pool, VAF_Z_Score = all_data_table$VAF_Z_Score, abs_frag_z_score = all_data_table$abs_frag_z_score, chrom_triple_max = all_data_table$chrom_triple_max)
glm.fit <- glm(pool~ VAF_Z_Score + abs_frag_z_score + chrom_triple_max, data = log_vf_frag_df, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm.probs <- predict(glm.fit,type = "response")
e <- roc(pool ~ glm.probs, data = log_vf_frag_df, print.auc=TRUE)
## Setting levels: control = normal, case = tumor
## Setting direction: controls < cases
auc(pool ~ glm.probs, data = log_vf_frag_df, print.auc=TRUE)
## Setting levels: control = normal, case = tumor
## Setting direction: controls < cases
## Area under the curve: 0.8572
ci.se(e,specificities = c(1))
## 95% CI (2000 stratified bootstrap replicates):
##  sp se.low se.median se.high
##   1 0.3432    0.4004  0.5381
Above 0.0
log_vf_frag_df <- data.frame(pool = all_data_table[which(all_data_table$pool == "normal" | all_data_table$mean_AF > 0.0),]$pool, VAF_Z_Score = all_data_table[which(all_data_table$pool == "normal" | all_data_table$mean_AF > 0.0),]$VAF_Z_Score, abs_frag_z_score = all_data_table[which(all_data_table$pool == "normal" | all_data_table$mean_AF > 0.0),]$abs_frag_z_score, chrom_triple_max = all_data_table[which(all_data_table$pool == "normal" | all_data_table$mean_AF > 0.0),]$chrom_triple_max)
glm.fit <- glm(pool~ VAF_Z_Score + abs_frag_z_score + chrom_triple_max, data = log_vf_frag_df, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm.probs <- predict(glm.fit,type = "response")
f <- roc(pool ~ glm.probs, data = log_vf_frag_df, print.auc=TRUE)
## Setting levels: control = normal, case = tumor
## Setting direction: controls < cases
auc(pool ~ glm.probs, data = log_vf_frag_df, print.auc=TRUE)
## Setting levels: control = normal, case = tumor
## Setting direction: controls < cases
## Area under the curve: 0.8866
ci.se(f,specificities = c(1))
## 95% CI (2000 stratified bootstrap replicates):
##  sp se.low se.median se.high
##   1 0.4269    0.4912  0.6374
Above 0.05
log_vf_frag_df <- data.frame(pool = all_data_table[which(all_data_table$pool == "normal" | all_data_table$mean_AF > 0.05),]$pool, VAF_Z_Score = all_data_table[which(all_data_table$pool == "normal" | all_data_table$mean_AF > 0.05),]$VAF_Z_Score, abs_frag_z_score = all_data_table[which(all_data_table$pool == "normal" | all_data_table$mean_AF > 0.05),]$abs_frag_z_score, chrom_triple_max = all_data_table[which(all_data_table$pool == "normal" | all_data_table$mean_AF > 0.05),]$chrom_triple_max)
glm.fit <- glm(pool~ VAF_Z_Score + abs_frag_z_score + chrom_triple_max, data = log_vf_frag_df, family = binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm.probs <- predict(glm.fit,type = "response")
g <- roc(pool ~ glm.probs, data = log_vf_frag_df, print.auc=TRUE)
## Setting levels: control = normal, case = tumor
## Setting direction: controls < cases
auc(pool ~ glm.probs, data = log_vf_frag_df, print.auc=TRUE)
## Setting levels: control = normal, case = tumor
## Setting direction: controls < cases
## Area under the curve: 1
plot(d, asp = NA, xlim=c(1, 0), ylim = c(0,1), col="purple") 
lines(g, col="green",lty=1)
lines(e, col="red",lty=1)
lines(f, col="blue",lty=1)
legend(0.4,0.4,legend=c("ctDNA = 0", "all samples","ctDNA above 0","ctDNA above 0.05"), col=c("purple","red","blue","green"), lty = 1)

ci.se(g,specificities = c(1))
## Warning in ci.se.roc(g, specificities = c(1)): ci.se() of a ROC curve with
## AUC == 1 is always a null interval and can be misleading.
## 95% CI (2000 stratified bootstrap replicates):
##  sp se.low se.median se.high
##   1      1         1       1